# Load the cancer registry data, normalise the column names, and derive the
# helper columns used by the plots and models further down.
cancer_df <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  # Split `geography` on the comma and consume any whitespace that follows it,
  # so the `state` labels do not start with a stray space.
  separate(geography, c("county", "state"), sep = ",\\s*") %>%
  mutate(
    county = as.factor(county),
    state = as.factor(state),
    # Average annual case count per 100,000 of the 2015 population estimate;
    # despite the `pct_` prefix this is a rate, not a percentage.
    pct_case_count = avg_ann_count / pop_est2015 * 100000
  ) %>%
  # Response and derived rate first, every other column in its existing order.
  select(target_death_rate, pct_case_count, everything())
## Parsed with column specification:
## cols(
## .default = col_double(),
## binnedInc = col_character(),
## Geography = col_character()
## )
## See spec(...) for full column specifications.
Percentage of annual diagnosed case count plot
# Scatter of pct_case_count against target_death_rate, coloured by state,
# then passed to ggplotly() for display.
# NOTE(review): target_death_rate is on the x-axis here, whereas the other
# scatter plots in this file put it on the y-axis.
plot_count_pct <- ggplot(
  data = cancer_df,
  mapping = aes(x = target_death_rate, y = pct_case_count, color = state)
) +
  geom_point()
# A smoothed trend can be overlaid by adding `+ geom_smooth(se = FALSE)`.
ggplotly(plot_count_pct)
Incidence rate plot
# Scatter of target_death_rate against incidence_rate, coloured by state,
# then passed to ggplotly() for display.
plot_incidence <- ggplot(
  data = cancer_df,
  mapping = aes(x = incidence_rate, y = target_death_rate, color = state)
) +
  geom_point()
# A smoothed trend can be overlaid by adding `+ geom_smooth(se = FALSE)`.
ggplotly(plot_incidence)
# Influential points in the dataset: the states Florida and Virginia.
Income plot
# Scatter of target_death_rate against med_income, coloured by state,
# then passed to ggplotly() for display.
plot_income <- ggplot(
  data = cancer_df,
  mapping = aes(x = med_income, y = target_death_rate, color = state)
) +
  geom_point()
# A smoothed trend can be overlaid by adding `+ geom_smooth(se = FALSE)`.
ggplotly(plot_income)
Age plots
# Scatter of target_death_rate against median_age with a smoothed trend line
# (confidence band suppressed), then passed to ggplotly() for display.
# No rows are filtered here, so out-of-range median_age values remain visible.
plot_age_1 <-
  cancer_df %>%
  ggplot(aes(x = median_age, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# This column contains erroneous values larger than 100; exclude them before
# drawing the distribution of median_age.
ggplot(
  data = filter(cancer_df, median_age < 100),
  mapping = aes(x = median_age)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter of target_death_rate against median_age_male with a smoothed trend
# line (confidence band suppressed), then passed to ggplotly() for display.
plot_age_2 <-
  cancer_df %>%
  ggplot(aes(x = median_age_male, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter of target_death_rate against median_age_female with a smoothed trend
# line (confidence band suppressed), then passed to ggplotly() for display.
plot_age_3 <-
  cancer_df %>%
  ggplot(aes(x = median_age_female, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Pairwise correlation matrix of the remaining columns once the two factor
# columns and the character column `binned_inc` have been dropped.
# as_tibble() replaces the deprecated as.tibble(). It discards the matrix row
# names, so row i of the result corresponds to the i-th column.
cancer_df %>%
  select(-county, -state, -binned_inc) %>%
  cor() %>%
  as_tibble()
## # A tibble: 33 x 33
## target_death_ra… pct_case_count avg_ann_count avg_deaths_per_…
## <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.0578 -0.144 -0.0907
## 2 -0.0578 1 0.161 -0.0589
## 3 -0.144 0.161 1 0.939
## 4 -0.0907 -0.0589 0.939 1
## 5 0.449 0.0225 0.0736 0.0627
## 6 -0.429 0.0291 0.269 0.223
## 7 -0.120 -0.0518 0.927 0.978
## 8 0.429 -0.123 -0.136 -0.0669
## 9 -0.0223 -0.00481 0.0821 0.0635
## 10 0.00438 0.0375 -0.0241 -0.0246
## # ... with 23 more rows, and 29 more variables: incidence_rate <dbl>,
## # med_income <dbl>, pop_est2015 <dbl>, poverty_percent <dbl>,
## # study_per_cap <dbl>, median_age <dbl>, median_age_male <dbl>,
## # median_age_female <dbl>, avg_household_size <dbl>,
## # percent_married <dbl>, pct_no_hs18_24 <dbl>, pct_hs18_24 <dbl>,
## # pct_some_col18_24 <dbl>, pct_bach_deg18_24 <dbl>, pct_hs25_over <dbl>,
## # pct_bach_deg25_over <dbl>, pct_employed16_over <dbl>,
## # pct_unemployed16_over <dbl>, pct_private_coverage <dbl>,
## # pct_private_coverage_alone <dbl>, pct_emp_priv_coverage <dbl>,
## # pct_public_coverage <dbl>, pct_public_coverage_alone <dbl>,
## # pct_white <dbl>, pct_black <dbl>, pct_asian <dbl>,
## # pct_other_race <dbl>, pct_married_households <dbl>, birth_rate <dbl>
# Fit target_death_rate on avg_ann_count, pop_est2015 and their interaction,
# and print the model summary.
summary(
  lm(target_death_rate ~ avg_ann_count * pop_est2015, data = cancer_df)
)
##
## Call:
## lm(formula = target_death_rate ~ avg_ann_count * pop_est2015,
## data = cancer_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -121.215 -17.194 -0.065 15.969 182.511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.812e+02 5.692e-01 318.377 < 2e-16 ***
## avg_ann_count -4.219e-03 9.348e-04 -4.513 6.63e-06 ***
## pop_est2015 -2.315e-06 4.606e-06 -0.503 0.615
## avg_ann_count:pop_est2015 4.743e-10 1.061e-10 4.468 8.17e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.37 on 3043 degrees of freedom
## Multiple R-squared: 0.02817, Adjusted R-squared: 0.02721
## F-statistic: 29.4 on 3 and 3043 DF, p-value: < 2.2e-16